library(readr)
library(ggplot2)
library(forcats)
library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
library(RColorBrewer)


data <- read_csv("train.csv")
## New names:
## • `` -> `...1`
## Rows: 103904 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Gender, Customer Type, Type of Travel, Class, satisfaction
## dbl (20): ...1, id, Age, Flight Distance, Inflight wifi service, Departure/A...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data)
## # A tibble: 6 × 25
##    ...1     id Gender `Customer Type`     Age `Type of Travel` Class   
##   <dbl>  <dbl> <chr>  <chr>             <dbl> <chr>            <chr>   
## 1     0  70172 Male   Loyal Customer       13 Personal Travel  Eco Plus
## 2     1   5047 Male   disloyal Customer    25 Business travel  Business
## 3     2 110028 Female Loyal Customer       26 Business travel  Business
## 4     3  24026 Female Loyal Customer       25 Business travel  Business
## 5     4 119299 Male   Loyal Customer       61 Business travel  Business
## 6     5 111157 Female Loyal Customer       26 Personal Travel  Eco     
## # ℹ 18 more variables: `Flight Distance` <dbl>, `Inflight wifi service` <dbl>,
## #   `Departure/Arrival time convenient` <dbl>, `Ease of Online booking` <dbl>,
## #   `Gate location` <dbl>, `Food and drink` <dbl>, `Online boarding` <dbl>,
## #   `Seat comfort` <dbl>, `Inflight entertainment` <dbl>,
## #   `On-board service` <dbl>, `Leg room service` <dbl>,
## #   `Baggage handling` <dbl>, `Checkin service` <dbl>,
## #   `Inflight service` <dbl>, Cleanliness <dbl>, …
summary(data)
##       ...1              id            Gender          Customer Type     
##  Min.   :     0   Min.   :     1   Length:103904      Length:103904     
##  1st Qu.: 25976   1st Qu.: 32534   Class :character   Class :character  
##  Median : 51952   Median : 64856   Mode  :character   Mode  :character  
##  Mean   : 51952   Mean   : 64924                                        
##  3rd Qu.: 77927   3rd Qu.: 97368                                        
##  Max.   :103903   Max.   :129880                                        
##                                                                         
##       Age        Type of Travel        Class           Flight Distance
##  Min.   : 7.00   Length:103904      Length:103904      Min.   :  31   
##  1st Qu.:27.00   Class :character   Class :character   1st Qu.: 414   
##  Median :40.00   Mode  :character   Mode  :character   Median : 843   
##  Mean   :39.38                                         Mean   :1189   
##  3rd Qu.:51.00                                         3rd Qu.:1743   
##  Max.   :85.00                                         Max.   :4983   
##                                                                       
##  Inflight wifi service Departure/Arrival time convenient Ease of Online booking
##  Min.   :0.00          Min.   :0.00                      Min.   :0.000         
##  1st Qu.:2.00          1st Qu.:2.00                      1st Qu.:2.000         
##  Median :3.00          Median :3.00                      Median :3.000         
##  Mean   :2.73          Mean   :3.06                      Mean   :2.757         
##  3rd Qu.:4.00          3rd Qu.:4.00                      3rd Qu.:4.000         
##  Max.   :5.00          Max.   :5.00                      Max.   :5.000         
##                                                                                
##  Gate location   Food and drink  Online boarding  Seat comfort  
##  Min.   :0.000   Min.   :0.000   Min.   :0.00    Min.   :0.000  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.00    1st Qu.:2.000  
##  Median :3.000   Median :3.000   Median :3.00    Median :4.000  
##  Mean   :2.977   Mean   :3.202   Mean   :3.25    Mean   :3.439  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.00    3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.00    Max.   :5.000  
##                                                                 
##  Inflight entertainment On-board service Leg room service Baggage handling
##  Min.   :0.000          Min.   :0.000    Min.   :0.000    Min.   :1.000   
##  1st Qu.:2.000          1st Qu.:2.000    1st Qu.:2.000    1st Qu.:3.000   
##  Median :4.000          Median :4.000    Median :4.000    Median :4.000   
##  Mean   :3.358          Mean   :3.382    Mean   :3.351    Mean   :3.632   
##  3rd Qu.:4.000          3rd Qu.:4.000    3rd Qu.:4.000    3rd Qu.:5.000   
##  Max.   :5.000          Max.   :5.000    Max.   :5.000    Max.   :5.000   
##                                                                           
##  Checkin service Inflight service  Cleanliness    Departure Delay in Minutes
##  Min.   :0.000   Min.   :0.00     Min.   :0.000   Min.   :   0.00           
##  1st Qu.:3.000   1st Qu.:3.00     1st Qu.:2.000   1st Qu.:   0.00           
##  Median :3.000   Median :4.00     Median :3.000   Median :   0.00           
##  Mean   :3.304   Mean   :3.64     Mean   :3.286   Mean   :  14.82           
##  3rd Qu.:4.000   3rd Qu.:5.00     3rd Qu.:4.000   3rd Qu.:  12.00           
##  Max.   :5.000   Max.   :5.00     Max.   :5.000   Max.   :1592.00           
##                                                                             
##  Arrival Delay in Minutes satisfaction      
##  Min.   :   0.00          Length:103904     
##  1st Qu.:   0.00          Class :character  
##  Median :   0.00          Mode  :character  
##  Mean   :  15.18                            
##  3rd Qu.:  13.00                            
##  Max.   :1584.00                            
##  NA's   :310
sum(is.na(data$`Arrival Delay in Minutes`))
## [1] 310
# Fill the missing arrival delays with the median of the observed values.
data[["Arrival Delay in Minutes"]] <- replace(
  data[["Arrival Delay in Minutes"]],
  is.na(data[["Arrival Delay in Minutes"]]),
  median(data[["Arrival Delay in Minutes"]], na.rm = TRUE)
)

# Count duplicated observations.
# `...1` is the unnamed first CSV column (0..n-1 in the summary above) and `id`
# presumably identifies each record -- TODO confirm. If either is unique per
# row, `duplicated(data)` can never flag anything, so both are left out of the
# comparison.
content_columns <- setdiff(names(data), c("...1", "id"))
sum(duplicated(data[content_columns]))
## [1] 0
categorical_vars_ggplot <- c(
  "Gender", "`Customer Type`", "`Type of Travel`", "Class", "satisfaction"
)

# Build one labelled bar chart per categorical column.
#
# The plots are stored and only rendered later, so each one is created inside
# a function call: the column name is then bound in that call's environment
# and every plot keeps its own column. `.data[[column]]` and
# `after_stat(count)` replace the deprecated `aes_string()` / `..count..`.
plot_list <- lapply(categorical_vars_ggplot, function(cat_var) {
  # Entries are written with backticks; strip them to get the real column name
  # (also keeps the backticks out of titles and axis labels).
  column <- gsub("`", "", cat_var, fixed = TRUE)

  ggplot(data, aes(x = .data[[column]], fill = .data[[column]])) +
    geom_bar() +
    geom_text(
      stat = "count",
      aes(label = after_stat(count), y = after_stat(count)),
      vjust = -0.5
    ) +
    labs(title = paste("Distribution of", column), x = column, y = "Count") +
    scale_fill_brewer(palette = "Set3") +
    theme_minimal() +
    theme(legend.position = "none")
})

# Keep the list keyed by the original (backticked) entries, as before.
names(plot_list) <- categorical_vars_ggplot
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
grid.arrange(grobs = plot_list, ncol = 2)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

numerical_vars_ggplot <- c(
  "Age", "`Flight Distance`", "`Inflight wifi service`",
  "`Departure/Arrival time convenient`", "`Ease of Online booking`",
  "`Gate location`", "`Food and drink`", "`Online boarding`",
  "`Seat comfort`", "`Inflight entertainment`", "`On-board service`",
  "`Leg room service`", "`Baggage handling`", "`Checkin service`",
  "`Inflight service`", "Cleanliness", "`Departure Delay in Minutes`",
  "`Arrival Delay in Minutes`"
)
library(ggplot2)

# Number of distinct values in each numeric column. Entries are written with
# backticks, which are stripped to obtain the real column name.
distinct_value_counts <- vapply(
  numerical_vars_ggplot,
  function(num_var) {
    length(unique(data[[gsub("`", "", num_var, fixed = TRUE)]]))
  },
  integer(1)
)

# Columns with fewer than 10 distinct values get a bar chart; the rest get a
# histogram. Split in one vectorised step instead of growing two vectors
# inside a loop.
has_few_values <- distinct_value_counts < 10
barplot_vars <- numerical_vars_ggplot[has_few_values]
histogram_vars <- numerical_vars_ggplot[!has_few_values]

barplot_vars
##  [1] "`Inflight wifi service`"             "`Departure/Arrival time convenient`"
##  [3] "`Ease of Online booking`"            "`Gate location`"                    
##  [5] "`Food and drink`"                    "`Online boarding`"                  
##  [7] "`Seat comfort`"                      "`Inflight entertainment`"           
##  [9] "`On-board service`"                  "`Leg room service`"                 
## [11] "`Baggage handling`"                  "`Checkin service`"                  
## [13] "`Inflight service`"                  "Cleanliness"
histogram_vars
## [1] "Age"                          "`Flight Distance`"           
## [3] "`Departure Delay in Minutes`" "`Arrival Delay in Minutes`"
# Print one distribution plot per numeric column: a labelled bar chart when
# the column has fewer than 10 distinct values, otherwise a 40-bin histogram.
# Each plot is printed inside the iteration that builds it, so `.data[[column]]`
# is resolved while `column` still holds the current name.
for (num_var in numerical_vars_ggplot) {
  # Strip the backticks to get the real column name (and clean labels).
  column <- gsub("`", "", num_var, fixed = TRUE)
  unique_vals <- length(unique(data[[column]]))
  plot_title <- paste("Distribution of", column)

  if (unique_vals < 10) {
    # No fill aesthetic is mapped here, so the former fill scale and legend
    # theme had no effect and are omitted.
    plot_obj <- ggplot(data, aes(x = .data[[column]])) +
      geom_bar() +
      geom_text(
        stat = "count",
        aes(label = after_stat(count), y = after_stat(count)),
        vjust = -0.5
      ) +
      labs(title = plot_title, x = column, y = "Frequency") +
      theme_minimal()
  } else {
    plot_obj <- ggplot(data, aes(x = .data[[column]])) +
      geom_histogram(
        bins = 40, fill = "skyblue", color = "black", alpha = 0.7
      ) +
      labs(title = plot_title, x = column, y = "Frequency")
  }

  print(plot_obj)
}

sample

continuous_vars <- c(
  "Age", "`Flight Distance`", "`Departure Delay in Minutes`",
  "`Arrival Delay in Minutes`"
)

# One 40-bin histogram per continuous column.
#
# The plots are rendered later by grid.arrange(), so each is built inside a
# function call that binds its own column name; `.data[[column]]` replaces the
# deprecated `aes_string()`.
continuous_plots <- lapply(continuous_vars, function(num_var) {
  # Strip the backticks to get the real column name (and clean labels).
  column <- gsub("`", "", num_var, fixed = TRUE)

  ggplot(data, aes(x = .data[[column]])) +
    geom_histogram(bins = 40, fill = "skyblue", color = "black", alpha = 0.7) +
    labs(
      title = paste("Distribution of", column), x = column, y = "Frequency"
    ) +
    theme_minimal()
})

# Lay the histograms out two per row.
if (length(continuous_plots) > 0) {
  grid.arrange(grobs = continuous_plots, ncol = 2)
}

rating_vars <- c(
  "`Inflight wifi service`", "`Departure/Arrival time convenient`",
  "`Ease of Online booking`", "`Gate location`", "`Food and drink`",
  "`Online boarding`", "`Seat comfort`", "`Inflight entertainment`",
  "`On-board service`", "`Leg room service`", "`Baggage handling`",
  "`Checkin service`", "`Inflight service`", "Cleanliness"
)

# One bar chart per rating column, with bars coloured by rating value.
#
# Two fixes over the former loop:
#  * `aes(fill = get(rate_var))` was only evaluated when grid.arrange() drew
#    the plots, i.e. after the loop had finished, so every plot looked up the
#    last loop value. Building each plot inside a function call gives every
#    plot its own binding of the column name.
#  * The ratings are numeric; the fill is wrapped in factor() so it is a
#    discrete aesthetic that the count stat keeps and the brewer palette can
#    colour (previously ggplot dropped it with a warning).
rating_plots <- lapply(rating_vars, function(rate_var) {
  # Strip the backticks to get the real column name (and clean labels).
  column <- gsub("`", "", rate_var, fixed = TRUE)

  ggplot(data, aes(x = .data[[column]])) +
    geom_bar(aes(fill = factor(.data[[column]]))) +
    geom_text(
      stat = "count",
      aes(label = after_stat(count), y = after_stat(count)),
      vjust = 0,
      size = 20
    ) +
    labs(
      title = paste("Distribution of", column), x = column, y = "Frequency"
    ) +
    scale_fill_brewer(palette = "Set3") +
    theme_minimal() +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 54, face = "bold"),
      axis.title.x = element_text(size = 52, face = "bold"),
      axis.title.y = element_text(size = 52, face = "bold"),
      axis.text.x = element_text(size = 50),
      axis.text.y = element_text(size = 50)
    )
})

# Lay the charts out two per row.
if (length(rating_plots) > 0) {
  grid.arrange(grobs = rating_plots, ncol = 2)
}
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Correlation Analysis for numerical variables

# Real column names: the plotting vector stores them wrapped in backticks.
numerical_vars_correlation <- gsub(
  "`", "", numerical_vars_ggplot,
  fixed = TRUE
)

# Pairwise correlations between all numeric columns (cor() defaults).
correlation_matrix <- cor(data[numerical_vars_correlation])

# Plot-size options; these `repr.*` names look like the ones read by
# Jupyter's R kernel -- TODO confirm they matter for this document.
options(repr.plot.width = 100, repr.plot.height = 80)

# Upper-triangle heat map, variables ordered by hierarchical clustering, with
# the coefficient printed in every cell.
corrplot(
  correlation_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 90,
  addCoef.col = "black",
  number.cex = 0.9,
  cl.cex = 0.9
)

Many service ratings (e.g., Online boarding, Seat comfort, Inflight entertainment, etc.) are moderately correlated with each other. This is expected as a passenger’s overall experience can influence their ratings across multiple services.

Departure Delay in Minutes and Arrival Delay in Minutes are positively correlated, indicating that flights which depart late often arrive late.

Identifying Outliers or Anomalies in the Data

selected_data <- data[c("Age", "Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes")]

# Count the values of `x` lying more than 1.5 * IQR below the first quartile
# or above the third quartile. Missing values are ignored.
count_iqr_outliers <- function(x) {
  quartiles <- quantile(x, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Named `spread` rather than `IQR` so stats::IQR() is not shadowed.
  spread <- quartiles[2] - quartiles[1]

  lower_bound <- quartiles[1] - 1.5 * spread
  upper_bound <- quartiles[2] + 1.5 * spread

  sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
}

# Named integer vector: one outlier count per selected column. vapply() fixes
# the result type, unlike sapply().
outliers_counts_selected <- vapply(
  selected_data, count_iqr_outliers, integer(1)
)

outliers_counts_selected
##                        Age            Flight Distance 
##                          0                       2291 
## Departure Delay in Minutes   Arrival Delay in Minutes 
##                      14529                      13954

Here is the number of potential outliers identified for each continuous numerical variable using the IQR method (values more than 1.5 × IQR below the first quartile or above the third quartile).

For our dataset:


Age: No outliers, so no action needed.

Flight Distance: Given that it’s plausible for some flights to have longer distances, capping might be a better approach than outright removal.

Departure/Arrival Delay in Minutes: Delays can vary significantly, with occasional very long delays. Instead of removing these values, it might be more beneficial to apply a log transformation to reduce the skewness and impact of extreme values.

Note : We’ll add 1 before applying the log transformation to handle instances with a delay of 0 minutes.


# Quartiles of flight distance, computed in a single quantile() call.
fd_quartiles <- quantile(data[["Flight Distance"]], c(0.25, 0.75))
Q1_fd <- fd_quartiles[1]
Q3_fd <- fd_quartiles[2]
IQR_fd <- Q3_fd - Q1_fd

# Caps at 1.5 * IQR beyond the quartiles.
lower_cap_fd <- Q1_fd - 1.5 * IQR_fd
upper_cap_fd <- Q3_fd + 1.5 * IQR_fd

# Clamp flight distance into [lower_cap_fd, upper_cap_fd].
raised_to_lower_cap <- pmax(data[["Flight Distance"]], lower_cap_fd)
data[["Flight Distance"]] <- pmin(raised_to_lower_cap, upper_cap_fd)

# Replace both delay columns with log(1 + minutes); log1p() maps 0 to 0.
# NOTE(review): this overwrites the columns in place, so running the chunk a
# second time would transform them again.
delay_columns <- c("Departure Delay in Minutes", "Arrival Delay in Minutes")
data[delay_columns] <- lapply(data[delay_columns], log1p)


head(data[c("Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes")])
## # A tibble: 6 × 3
##   `Flight Distance` `Departure Delay in Minutes` `Arrival Delay in Minutes`
##               <dbl>                        <dbl>                      <dbl>
## 1               460                        3.26                        2.94
## 2               235                        0.693                       1.95
## 3              1142                        0                           0   
## 4               562                        2.48                        2.30
## 5               214                        0                           0   
## 6              1180                        0                           0
categorical_features <- c(
  "Gender", "`Customer Type`", "`Type of Travel`", "Class"
)

# Print one dodged bar chart per categorical feature, split by satisfaction,
# with the count written above every bar.
#
# The former `par(mfrow = ...)` call is gone: par() configures base graphics
# only and does not lay out ggplot output. `.data[[column]]` and
# `after_stat(count)` replace the deprecated `aes_string()` / `..count..`.
# Each plot is printed in the iteration that builds it, so the column name is
# resolved while it is still current.
for (feature in categorical_features) {
  # Strip the backticks to get the real column name (and clean labels).
  column <- gsub("`", "", feature, fixed = TRUE)

  p <- ggplot(data, aes(x = .data[[column]], fill = satisfaction)) +
    geom_bar(position = "dodge") +
    geom_text(
      stat = "count",
      aes(label = after_stat(count)),
      vjust = -0.5,
      position = position_dodge(width = 0.9)
    ) +
    labs(
      title = paste("Distribution of", column, "by Satisfaction"),
      x = column,
      y = "Count"
    ) +
    scale_fill_brewer(palette = "Set3") +
    theme_minimal() +
    theme(legend.position = "top")
  print(p)
}

Observations


Gender: Both genders have a fairly similar distribution of satisfaction levels. Customer Type: Loyal customers tend to be more satisfied than disloyal ones. Type of Travel: Passengers traveling for business purposes are generally more satisfied than those traveling for personal reasons. Class: Business class passengers are noticeably more satisfied than those in Eco or Eco Plus.


Distribution of numerical features by satisfaction - KDE (Kernel Density Estimation)

numerical_features <- c(
  "Age", "`Flight Distance`", "`Departure Delay in Minutes`",
  "`Arrival Delay in Minutes`"
)

# Print one overlaid density plot per numeric feature, split by satisfaction.
#
# The former `par(mfrow = ...)` call is gone: par() configures base graphics
# only and does not lay out ggplot output. `.data[[column]]` replaces the
# deprecated `aes_string()`. Each plot is printed in the iteration that builds
# it, so the column name is resolved while it is still current.
for (feature in numerical_features) {
  # Strip the backticks to get the real column name (and clean labels).
  column <- gsub("`", "", feature, fixed = TRUE)

  p <- ggplot(data, aes(x = .data[[column]], fill = satisfaction)) +
    geom_density(alpha = 0.5, position = "identity") +
    labs(
      title = paste("Distribution of", column, "by Satisfaction"),
      x = column,
      y = "Density"
    ) +
    scale_fill_manual(
      values = c("satisfied" = "green", "neutral or dissatisfied" = "red")
    ) +
    theme_minimal() +
    theme(legend.position = "top")
  print(p)
}

Observations


Age: Younger passengers tend to be more neutral or dissatisfied, while older passengers lean more towards satisfaction.

Flight Distance: Passengers traveling shorter distances seem to be more neutral or dissatisfied compared to those traveling longer distances.

Departure Delay in Minutes: Although the distributions overlap considerably, there’s a slightly higher density of neutral or dissatisfied passengers with longer departure delays.

Arrival Delay in Minutes: Similar to departure delays, passengers with longer arrival delays tend to be more neutral or dissatisfied.


Visualize the relationship between Age and Flight Distance colored by satisfaction.

# Plot a random subset of at most 5000 rows.
# The seed is fixed so the same rows are drawn on every run and the figure
# stays consistent with the written observations; the size is capped at the
# number of rows so sample() cannot fail on a smaller data set.
set.seed(42)
sample_size <- min(5000, nrow(data))
data_sample <- data[sample(nrow(data), sample_size), ]

# Scatter plot
ggplot(data_sample, aes(x = Age, y = `Flight Distance`, color = satisfaction)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(
    values = c("neutral or dissatisfied" = "red", "satisfied" = "green")
  ) +
  labs(title = "Relationship between Age and Flight Distance by Satisfaction") +
  theme_minimal()


There’s a spread of satisfied and neutral/dissatisfied customers across various ages and flight distances.

Older passengers who travel longer distances seem to be predominantly satisfied.

Younger passengers, especially those traveling shorter distances, display a mix of satisfaction levels.


Chi-Squared Test

We will assess the association between each categorical variable and the satisfaction target variable.

Null hypothesis: There is no association between the categorical variable and the satisfaction of passengers. Alternative hypothesis: There is a statistically significant association between the categorical variable and the satisfaction of passengers.

categorical_vars <- c('Gender', 'Customer Type', 'Type of Travel', 'Class')

#' Chi-squared test of one categorical column against `satisfaction`
#'
#' Cross-tabulates `dataset[[feature]]` with `dataset[["satisfaction"]]` and
#' runs `chisq.test()` on the resulting contingency table.
#'
#' @param feature Name of the categorical column, as a string.
#' @param dataset Data frame holding `feature` and a `satisfaction` column.
#'   Defaults to the global `data`, so existing one-argument calls behave as
#'   before.
#' @return The p-value reported by `chisq.test()`.
perform_chi2_test <- function(feature, dataset = data) {
  contingency_table <- table(dataset[[feature]], dataset[["satisfaction"]])
  chisq.test(contingency_table)$p.value
}

# One p-value per categorical variable. vapply() guarantees a numeric result
# and already names it after `categorical_vars`, so no separate names<- call
# is needed.
chi2_p_values <- vapply(categorical_vars, perform_chi2_test, numeric(1))

chi2_p_values
##         Gender  Customer Type Type of Travel          Class 
##   8.496755e-05   0.000000e+00   0.000000e+00   0.000000e+00

For all these variables, the p-values are extremely small, indicating that there is a statistically significant association between the categorical variable and the target variable satisfaction.

Given the extremely small p-values for all the categorical variables, we can reject the null hypothesis for each of them. This implies that there’s a statistically significant association between each of these categorical variables (Gender, Customer Type, Type of Travel, and Class) and passenger satisfaction.

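In simpler terms, the likelihood of a passenger being satisfied (or not) is not independent of their gender, customer type, type of travel, or class. Each of these factors is associated with satisfaction level; note that the test establishes association, not causation, and with more than 100,000 rows even a weak association (as for Gender) yields a very small p-value.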


T-test

Null Hypothesis : There is no difference in the means of “Arrival Delay in Minutes” between the two satisfaction groups (satisfied and neutral or dissatisfied).

Alternative hypothesis: There is a significant difference in the means of “Arrival Delay in Minutes” between the two satisfaction groups.

We will perform a two-sample t-test to determine if the means of “Arrival Delay in Minutes” for the two satisfaction groups are statistically different.

# Arrival delay and satisfaction label, pulled out once.
# Note: the arrival-delay column was replaced by log1p(minutes) earlier in
# this file, so the group means below are on that scale.
arrival_delay <- data[["Arrival Delay in Minutes"]]
satisfaction_label <- data[["satisfaction"]]

# Split the delays by satisfaction group.
group1_arrival_delay <- arrival_delay[satisfaction_label == "satisfied"]
group2_arrival_delay <- arrival_delay[
  satisfaction_label == "neutral or dissatisfied"
]

# Two-sample t-test with t.test() defaults (reported as Welch in the output).
t_test_result_arrival_delay <- t.test(
  group1_arrival_delay,
  group2_arrival_delay
)

t_test_result_arrival_delay
## 
##  Welch Two Sample t-test
## 
## data:  group1_arrival_delay and group2_arrival_delay
## t = -33.162, df = 100286, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3528694 -0.3134851
## sample estimates:
## mean of x mean of y 
##  1.072264  1.405441

Given the very low p-value, we can reject the null hypothesis. This means there is a statistically significant difference in the means of “Arrival Delay in Minutes” between passengers who are satisfied and those who are neutral or dissatisfied. Note that the delay columns were log-transformed (log1p) earlier, so the reported means (1.07 vs 1.41) are on the log(1 + minutes) scale rather than in minutes. Specifically, passengers who are satisfied seem to experience, on average, a shorter arrival delay compared to those who are neutral or dissatisfied.